import pandas as pd
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
import warnings
import itertools
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import het_breuschpagan
# Ignore all warnings
warnings.filterwarnings("ignore")
import arima_prophet_functions
Importing plotly failed. Interactive plots will not work.
Этот ноутбук показывает обучение лучших ARIMA и Prophet моделей для данных загрузки ОНМК центров Санкт-Петербурга.
Данные представляют собой временной ряд месячной загрузки центров с января 2017 по ноябрь 2024 года.
Оценка будет проводиться по суммарной загрузке центров в месяц, иными словами, оценка будет проводиться для всех госпиталей сразу, а не по отдельности.
Считываем данные и задаём основную колонку для анализа
!ls data
# Per-hospital monthly data, missing values imputed with MICE Random Forest
data_path_MR = "data/data_summarized_by_month_filled_hosp_MR.tsv"
data_hosp_MR = pd.read_csv(data_path_MR, sep="\t")
F1.csv F1.xlsx data_summarized_by_month_filled.tsv data_summarized_by_month_filled_hosp.tsv data_summarized_by_month_filled_hosp_MR.tsv
# Target column for the whole analysis (total stroke patients treated per month).
main_column = 'Пролечены_с_ОНМК_Всего'

# MICE RF with hospitals
data_hosp_MR_used = data_hosp_MR.copy()

# Drop the hospital excluded from the analysis
data_hosp_MR_used = data_hosp_MR_used.loc[~data_hosp_MR_used.hosp.isin(["med1"])]

# Build the datetime column once and derive the 'YYYY-MM' plotting label from it
# (the original computed the same pd.to_datetime twice).
_month_starts = pd.to_datetime(data_hosp_MR_used[['year', 'month']].assign(Day=1))
data_hosp_MR_used.loc[:, 'Date'] = _month_starts
data_hosp_MR_used.loc[:, 'Date_Graph'] = _month_starts.dt.strftime('%Y-%m')

# Keep only the columns used downstream
data_hosp_MR_used = data_hosp_MR_used.loc[:, [main_column, 'Date', 'Date_Graph', 'year', 'month']]

# Aggregate (sum) the load over all hospitals for each month
data_hosp_MR_used = data_hosp_MR_used.groupby(['Date', 'Date_Graph', 'year', 'month']).sum().reset_index()

# Keep everything up to and including October 2024 (dates are first-of-month,
# so '< 2024-10-31' retains the 2024-10-01 row)
data_hosp_MR_used = data_hosp_MR_used.loc[data_hosp_MR_used.Date < pd.to_datetime("2024-10-31")]
# import matplotlib.pyplot as plt
# import numpy as np
# # Create the figure and axes
# fig, axes = plt.subplots(1, 1, figsize=(12, 12), sharex=True)
# axes = axes.flatten()
# # Define the datasets and titles
# datasets = [data_basic_used, data_hosp_used, data_hosp_MR_used]
# titles = ["All_Hospitals 2017-2024 (Basic)",
# "All_Hospitals 2017-2024 (Hospitals)",
# "All_Hospitals 2017-2024 (MR Hospitals)"]
# # Loop through axes and datasets
# for i, ax in enumerate(axes):
# # Copy the data to avoid modifying the original
# data_plot = datasets[i].copy()
# # Plot the data
# ax.plot(data_plot['Date_Graph'], data_plot[main_column],
# label="All_Hospitals", marker='o', color="gray")
# # Set the title and labels
# ax.set_title(titles[i], fontsize=14)
# ax.set_ylabel("Hospital Load", fontsize=12)
# # Set the xticks and labels
# xticks = np.arange(len(data_plot['Date_Graph'])) # Index positions
# skip_ticks = xticks[::2] # Take every second tick
# ax.set_xticks(skip_ticks)
# ax.set_xticklabels(data_plot['Date_Graph'].iloc[skip_ticks], rotation=90)
# # Add grid and legend
# ax.grid(True, which='both', linestyle='--', linewidth=0.5)
# ax.legend(fontsize=12)
# # Set shared x-axis label
# axes[-1].set_xlabel("Year-Month", fontsize=12)
# # Adjust layout and add a super title
# plt.tight_layout(rect=[0, 0, 1, 0.96]) # Adjust for supertitle
# plt.suptitle("Hospital Load Over Time (2017-2024)", fontsize=16)
# # Show the plot
# plt.show()
Изучать наш временной ряд будем на отрезке 2017-2023 года. 2024 год будет отложен для тестирования модели.
# Train on 2017-2023; 2024 is the hold-out test period.
data_used_model = data_hosp_MR_used.copy()
data_used_train = data_used_model.loc[data_used_model.Date < pd.to_datetime("2024-01-01")]
data_used_test = data_used_model.loc[data_used_model.Date >= pd.to_datetime("2024-01-01")]

# Monthly series for 2017-2023 with a proper DatetimeIndex for decomposition.
data_ts = data_hosp_MR_used.loc[data_hosp_MR_used.year.isin([2017, 2018, 2019, 2020, 2021, 2022, 2023])][[main_column, 'Date']]
data_ts = data_ts[main_column]  # column selection already yields a Series; no re-wrap needed
# Derive the period count from the data instead of hardcoding 84.
data_ts.index = pd.date_range(start="2017-01-01", periods=len(data_ts), freq="M")
# Decompose the time series into trend + seasonal + residual (additive model)
decomposition = seasonal_decompose(data_ts, model='additive')
seasonal = decomposition.seasonal
residual = decomposition.resid

# Calculate variances of the seasonal and residual components
residual_variance = np.var(residual.dropna())
seasonal_variance = np.var(seasonal.dropna())

# Calculate Seasonal Strength (in [0, 1]; closer to 1 = stronger seasonality).
# NOTE(review): the textbook definition (Hyndman) is 1 - Var(R)/Var(S + R);
# this variant implicitly assumes S and R are uncorrelated — confirm intent.
seasonal_strength = 1 - (residual_variance / (residual_variance + seasonal_variance))
print(f"Seasonal Strength: {seasonal_strength:.3f}")

fig = decomposition.plot()  # `decomposition.plot()` returns a Figure
fig.set_size_inches(6, 6)  # Optional: Adjust the figure size

# Rotate x-axis tick labels for each subplot
for ax in fig.axes:
    for label in ax.get_xticklabels():
        label.set_rotation(45)

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()
Seasonal Strength: 0.306
from statsmodels.tsa.stattools import adfuller

# Index the frame by date; drop missing rows (fillna(0) kept for parity).
data_plot = data_hosp_MR_used.copy()
data_plot.set_index('Date', inplace=True)
data_plot = data_plot.dropna().fillna(0)

# Augmented Dickey-Fuller stationarity test on the raw (undifferenced) series.
result = adfuller(data_plot[main_column])
test_stat, p_value, n_lags, n_obs, crit_values = result[0], result[1], result[2], result[3], result[4]

# Report the test outcome.
print(f'*** {"All_Hospitals"} ***')
print("Dickey-Fuller Test Results:")
print(f"Test Statistic: {test_stat}")
print(f"p-value: {p_value}")
print(f"Number of Lags Used: {n_lags}")
print(f"Number of Observations Used: {n_obs}")
print("Critical Values:")
for level, threshold in crit_values.items():
    print(f" {level}: {threshold}")
*** All_Hospitals *** Dickey-Fuller Test Results: Test Statistic: -3.036366478552415 p-value: 0.031634988394189344 Number of Lags Used: 9 Number of Observations Used: 84 Critical Values: 1%: -3.510711795769895 5%: -2.8966159448223734 10%: -2.5854823866213152
Видим стационарность, но не слабую p=0.03, Попробуем продифференцировать ряд.
# Visual check of the raw (undifferenced) monthly series
plt.plot(data_plot[main_column])
[<matplotlib.lines.Line2D at 0x7f199b29c8b0>]
data_plot = data_hosp_MR_used.copy()
data_plot.set_index('Date', inplace=True)
data_plot = data_plot.dropna().fillna(0)

# First-order differencing to remove the trend
series = data_plot[main_column].diff(1).dropna()

# Perform the Dickey-Fuller test on the differenced series
result = adfuller(series)

# Display results
print(f'*** {"All_Hospitals"} ***')
print("Dickey-Fuller Test Results:")
print(f"Test Statistic: {result[0]}")
print(f"p-value: {result[1]}")
print(f"Number of Lags Used: {result[2]}")
print(f"Number of Observations Used: {result[3]}")
print("Critical Values:")
for key, value in result[4].items():
    print(f" {key}: {value}")
*** All_Hospitals *** Dickey-Fuller Test Results: Test Statistic: -4.928434258839517 p-value: 3.0692049629707045e-05 Number of Lags Used: 5 Number of Observations Used: 87 Critical Values: 1%: -3.5078527246648834 5%: -2.895382030636155 10%: -2.584823877658872
# Visual check of the first-differenced series (trend removed)
plt.plot(data_plot[main_column].diff(1).dropna())
[<matplotlib.lines.Line2D at 0x7f1999340040>]
Оп, уже стационарность побольше! И ряд сам "выравнивается" - нет линейных трендов, что хорошо для ARIMA моделей. Поэтому будем использовать дифференцированный ряд.
# Select the Load time series (first-differenced, as decided above)
load_series = pd.Series(data_used_train[main_column]).diff(1).dropna()

# Plot ACF and PACF side by side to pick candidate AR (p) / MA (q) orders
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# ACF plot
plot_acf(load_series, ax=axes[0], lags=40)  # Adjust lags as needed
axes[0].set_title('Autocorrelation Function (ACF)')
axes[0].set_xticks(range(0, 40, 2))

# PACF plot
plot_pacf(load_series, ax=axes[1], lags=40, method='ywm')  # 'ywm' is robust for small samples
axes[1].set_title('Partial Autocorrelation Function (PACF)')
axes[1].set_xticks(range(0, 40, 2))

plt.tight_layout()
plt.show()
Что тут видим: ...
Так как работа пакета "auto_arima" меня не устроила, я написал собственный грид-серч с использованием itertools.
Ниже будет показан тестовый прогон моего кода:
# Non-seasonal grid search: p, q in [0, 5], d = 1, no seasonal AR/MA terms.
nonseasonal_space = dict(
    min_p=0, max_p=5,
    min_q=0, max_q=5,
    d=1,
    min_P=0, max_P=0,
    min_Q=0, max_Q=0,
    min_D=0, max_D=0,
    seasonal_period=12,
    trends=["n"],
)
results = arima_prophet_functions.grid_search_arima(
    data=data_used_train[main_column],
    **nonseasonal_space,
)
# Show the five best configurations by AIC.
results.sort_values('AIC').head()
| p | q | d | P | Q | D | seasonal_period | trend | AIC | |
|---|---|---|---|---|---|---|---|---|---|
| 29 | 4 | 5 | 1 | 0 | 0 | 0 | 12 | n | 1045.354399 |
| 17 | 2 | 5 | 1 | 0 | 0 | 0 | 12 | n | 1047.165300 |
| 35 | 5 | 5 | 1 | 0 | 0 | 0 | 12 | n | 1047.412144 |
| 23 | 3 | 5 | 1 | 0 | 0 | 0 | 12 | n | 1047.771729 |
| 5 | 0 | 5 | 1 | 0 | 0 | 0 | 12 | n | 1054.054548 |
# Same p/q grid, but now with one seasonal AR and one seasonal MA term (P=Q=1).
seasonal_space = dict(
    min_p=0, max_p=5,
    min_q=0, max_q=5,
    d=1,
    min_P=1, max_P=1,
    min_Q=1, max_Q=1,
    min_D=0, max_D=0,
    seasonal_period=12,
    trends=["n"],
)
results = arima_prophet_functions.grid_search_arima(
    data=data_used_train[main_column],
    **seasonal_space,
)
# Show the five best configurations by AIC.
results.sort_values('AIC').head()
| p | q | d | P | Q | D | seasonal_period | trend | AIC | |
|---|---|---|---|---|---|---|---|---|---|
| 29 | 4 | 5 | 1 | 1 | 1 | 0 | 12 | n | 883.948428 |
| 17 | 2 | 5 | 1 | 1 | 1 | 0 | 12 | n | 884.057333 |
| 23 | 3 | 5 | 1 | 1 | 1 | 0 | 12 | n | 886.649834 |
| 35 | 5 | 5 | 1 | 1 | 1 | 0 | 12 | n | 887.223640 |
| 5 | 0 | 5 | 1 | 1 | 1 | 0 | 12 | n | 889.524956 |
# AIC distribution per MA order q across the seasonal grid results
sns.boxplot(data=results, y="AIC", x="q")
<Axes: xlabel='q', ylabel='AIC'>
Модуль обучения различных ARIMA моделей и поиска лучшей
# Baseline ARIMA(1, 1, 1) fit on the monthly totals
data_used_model = data_hosp_MR_used.copy()
train_month=84  # first 84 months = 2017-2023
data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(1, 1, 1), train_month=train_month, trend='n')
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column)
print(model.summary())
metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
metrics_df
SARIMAX Results
==================================================================================
Dep. Variable: Пролечены_с_ОНМК_Всего No. Observations: 84
Model: SARIMAX(1, 1, 1) Log Likelihood -554.497
Date: Thu, 30 Jan 2025 AIC 1114.995
Time: 19:51:15 BIC 1122.178
Sample: 0 HQIC 1117.877
- 84
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.2334 0.131 -1.779 0.075 -0.491 0.024
ma.L1 -0.8828 0.066 -13.475 0.000 -1.011 -0.754
sigma2 5.131e+04 9767.889 5.253 0.000 3.22e+04 7.05e+04
===================================================================================
Ljung-Box (L1) (Q): 0.04 Jarque-Bera (JB): 1.42
Prob(Q): 0.85 Prob(JB): 0.49
Heteroskedasticity (H): 0.96 Skew: 0.17
Prob(H) (two-sided): 0.91 Kurtosis: 2.44
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
*** TRAIN ***
MAE: 214.14
MSE: 82432.94
RMSE: 287.11
MAPE: 12.82%
AIC: 1114.9945345742562
*** TEST ***
MAE: 217.73
MSE: 59746.17
RMSE: 244.43
MAPE: 15.10%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 214.137296 | 217.734950 |
| MSE | 82432.935135 | 59746.172263 |
| RMSE | 287.111364 | 244.430301 |
| MAPE (%) | 12.819832 | 15.103578 |
| AIC | 1114.994535 | NaN |
# Residual diagnostics (Ljung-Box test) for the ARIMA(1, 1, 1) fit
arima_prophet_functions.model_performance(data_check_metrics, main_column)
Ljung-Box Test
lb_stat lb_pvalue
10 12.362577 0.261523
# Plot fitted values, out-of-sample forecast and its confidence band.
# cutoff_date is in MM-DD-YYYY form (end of the training period).
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           cutoff_date="12-30-2023", vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)

# Rolling forecast over the 2024 hold-out: months 84..94 of the series
# (presumably refit/extended month by month — see arima_prophet_functions).
train_month_start = 84
train_month_end = 94
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (1,1,1),
                                                                                            train_month_start=train_month_start,
                                                                                            train_month_end=train_month_end)
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN *** MAE: 214.14 MSE: 82432.94 RMSE: 287.11 MAPE: 12.82% AIC: 1114.9945345742562 *** TEST *** MAE: 199.50 MSE: 45576.68 RMSE: 213.49 MAPE: 13.53%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 214.137296 | 199.501572 |
| MSE | 82432.935135 | 45576.676463 |
| RMSE | 287.111364 | 213.486947 |
| MAPE (%) | 12.819832 | 13.531049 |
| AIC | 1114.994535 | NaN |
# Plot rolling-forecast results (no confidence-interval columns passed here)
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           cutoff_date="12-30-2023", vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)
# Same pipeline with a richer non-seasonal model: ARIMA(2, 1, 3)
data_used_model = data_hosp_MR_used.copy()
train_month=84  # first 84 months = 2017-2023
data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(2, 1, 3), train_month=train_month, trend='n')
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column)
print(model.summary())
metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
metrics_df
SARIMAX Results
==================================================================================
Dep. Variable: Пролечены_с_ОНМК_Всего No. Observations: 84
Model: SARIMAX(2, 1, 3) Log Likelihood -529.077
Date: Thu, 30 Jan 2025 AIC 1070.153
Time: 19:51:17 BIC 1084.370
Sample: 0 HQIC 1075.849
- 84
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -1.1369 0.017 -65.637 0.000 -1.171 -1.103
ar.L2 -0.9950 0.019 -51.576 0.000 -1.033 -0.957
ma.L1 0.3204 0.139 2.309 0.021 0.048 0.592
ma.L2 -0.1072 0.123 -0.874 0.382 -0.348 0.133
ma.L3 -0.9042 0.200 -4.515 0.000 -1.297 -0.512
sigma2 3.546e+04 8.05e-06 4.41e+09 0.000 3.55e+04 3.55e+04
===================================================================================
Ljung-Box (L1) (Q): 0.32 Jarque-Bera (JB): 3.66
Prob(Q): 0.57 Prob(JB): 0.16
Heteroskedasticity (H): 0.80 Skew: -0.32
Prob(H) (two-sided): 0.58 Kurtosis: 2.16
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.09e+25. Standard errors may be unstable.
*** TRAIN ***
MAE: 203.51
MSE: 98045.29
RMSE: 313.12
MAPE: 12.13%
AIC: 1070.153429480983
*** TEST ***
MAE: 147.26
MSE: 32045.75
RMSE: 179.01
MAPE: 10.00%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 203.510432 | 147.257832 |
| MSE | 98045.286626 | 32045.747262 |
| RMSE | 313.121840 | 179.013260 |
| MAPE (%) | 12.129705 | 10.000349 |
| AIC | 1070.153429 | NaN |
# Residual diagnostics (Ljung-Box test) for the ARIMA(2, 1, 3) fit
arima_prophet_functions.model_performance(data_check_metrics, main_column)
Ljung-Box Test
lb_stat lb_pvalue
10 20.968363 0.021315
# Fit/forecast plot for ARIMA(2, 1, 3), with confidence band
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           cutoff_date="12-30-2023", vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)

# Rolling forecast over the 2024 hold-out months with ARIMA(2, 1, 3)
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (2,1,3),
                                                                                            train_month_start=train_month_start,
                                                                                            train_month_end=train_month_end)
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN *** MAE: 203.51 MSE: 98045.29 RMSE: 313.12 MAPE: 12.13% AIC: 1070.153429480983 *** TEST *** MAE: 121.60 MSE: 24424.27 RMSE: 156.28 MAPE: 8.16%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 203.510432 | 121.601240 |
| MSE | 98045.286626 | 24424.271559 |
| RMSE | 313.121840 | 156.282666 |
| MAPE (%) | 12.129705 | 8.158376 |
| AIC | 1070.153429 | NaN |
# Inspect the combined frame: Fitted on train rows, Prediction + CI on test rows
final_data_check_metrics
| Date | Date_Graph | Пролечены_с_ОНМК_Всего | Пролечены_с_ОНМК_Всего_Fitted | Пролечены_с_ОНМК_Всего_Prediction | Пролечены_с_ОНМК_Всего_Prediction_CI_low | Пролечены_с_ОНМК_Всего_Prediction_CI_upp | |
|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 | 2017-01 | 1446.0 | 0.000000 | None | None | None |
| 1 | 2017-02-01 | 2017-02 | 1859.0 | 624.025294 | None | None | None |
| 2 | 2017-03-01 | 2017-03 | 2285.0 | 1093.769856 | None | None | None |
| 3 | 2017-04-01 | 2017-04 | 1678.0 | 1397.157963 | None | None | None |
| 4 | 2017-05-01 | 2017-05 | 1657.0 | 1921.589855 | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 89 | 2024-06-01 | 2024-06 | 1446.0 | NaN | 1551.242293 | 1162.663339 | 1939.821247 |
| 90 | 2024-07-01 | 2024-07 | 1349.0 | NaN | 1459.59544 | 1073.438359 | 1845.752521 |
| 91 | 2024-08-01 | 2024-08 | 1802.0 | NaN | 1830.500051 | 1443.776983 | 2217.22312 |
| 92 | 2024-09-01 | 2024-09 | 1322.0 | NaN | 1458.503434 | 1094.459409 | 1822.54746 |
| 93 | 2024-10-01 | 2024-10 | 1729.0 | NaN | 1457.02495 | 1094.054345 | 1819.995554 |
94 rows × 7 columns
# Plot rolling-forecast results including the confidence band
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           cutoff_date="12-30-2023",
                                           ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)
# ARIMA(2, 1, 3) with a seasonal (1, 0, 1, 12) component — i.e. SARIMA
data_used_model = data_hosp_MR_used.copy()
train_month=84  # first 84 months = 2017-2023
data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(2, 1, 3), train_month=train_month, trend='n',
                                                  seasonal_order=(1, 0, 1, 12))
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column)
print(model.summary())
metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
metrics_df
SARIMAX Results
============================================================================================
Dep. Variable: Пролечены_с_ОНМК_Всего No. Observations: 84
Model: SARIMAX(2, 1, 3)x(1, 0, [1], 12) Log Likelihood -445.405
Date: Thu, 30 Jan 2025 AIC 906.810
Time: 19:51:18 BIC 924.447
Sample: 0 HQIC 913.789
- 84
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -1.1596 0.016 -73.907 0.000 -1.190 -1.129
ar.L2 -1.0245 0.024 -43.393 0.000 -1.071 -0.978
ma.L1 0.4167 0.366 1.138 0.255 -0.301 1.134
ma.L2 0.0988 0.403 0.246 0.806 -0.690 0.888
ma.L3 -0.7514 0.481 -1.563 0.118 -1.694 0.191
ar.S.L12 0.6709 0.126 5.344 0.000 0.425 0.917
ma.S.L12 -0.7560 0.428 -1.766 0.077 -1.595 0.083
sigma2 3.129e+04 1.86e+04 1.679 0.093 -5239.739 6.78e+04
===================================================================================
Ljung-Box (L1) (Q): 1.34 Jarque-Bera (JB): 1.22
Prob(Q): 0.25 Prob(JB): 0.54
Heteroskedasticity (H): 0.77 Skew: -0.26
Prob(H) (two-sided): 0.54 Kurtosis: 2.59
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
*** TRAIN ***
MAE: 217.10
MSE: 110861.39
RMSE: 332.96
MAPE: 12.83%
AIC: 906.8095059859434
*** TEST ***
MAE: 112.86
MSE: 16697.32
RMSE: 129.22
MAPE: 7.36%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 217.096661 | 112.856308 |
| MSE | 110861.391485 | 16697.321712 |
| RMSE | 332.958543 | 129.218117 |
| MAPE (%) | 12.832502 | 7.362880 |
| AIC | 906.809506 | NaN |
# Fit/forecast plot for the seasonal model, with confidence band
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           cutoff_date="12-30-2023", vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)

# Residual diagnostics (Ljung-Box test) for the seasonal model
arima_prophet_functions.model_performance(data_check_metrics, main_column)
Ljung-Box Test
lb_stat lb_pvalue
10 18.975142 0.04058
# Rolling forecast over the 2024 hold-out months with the seasonal model
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (2,1,3), seasonal_order=(1, 0, 1, 12), train_month_start=84, train_month_end=94)
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN *** MAE: 217.10 MSE: 110861.39 RMSE: 332.96 MAPE: 12.83% AIC: 906.8095059859434 *** TEST *** MAE: 108.89 MSE: 17436.32 RMSE: 132.05 MAPE: 7.02%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 217.096661 | 108.888046 |
| MSE | 110861.391485 | 17436.315568 |
| RMSE | 332.958543 | 132.046642 |
| MAPE (%) | 12.832502 | 7.019480 |
| AIC | 906.809506 | NaN |
# Plot rolling-forecast results of the seasonal model, with confidence band
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           cutoff_date="12-30-2023",
                                           ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)
# COVID dummy regressor: 1 for every month of 2020, else 0
# (single expression instead of the original two-step assignment).
data_used_model['COVID'] = (data_used_model['year'] == 2020).astype(int)

train_month=84  # first 84 months = 2017-2023
data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]

# SARIMAX(2,1,3)x(1,0,1,12) with the COVID exogenous regressor.
# Pass the train_month variable instead of repeating the literal 84.
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(2, 1, 3), trend="n", seasonal_order=(1, 0, 1, 12),
                                                  exog=data_used_model['COVID'], train_month=train_month)
# For the test period the exogenous values must be supplied explicitly.
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column, exog_test=np.array(data_used_test['COVID']))
print(model.summary())
metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
metrics_df
SARIMAX Results
============================================================================================
Dep. Variable: Пролечены_с_ОНМК_Всего No. Observations: 84
Model: SARIMAX(2, 1, 3)x(1, 0, [1], 12) Log Likelihood -445.533
Date: Thu, 30 Jan 2025 AIC 909.066
Time: 19:51:22 BIC 928.908
Sample: 0 HQIC 916.918
- 84
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
x1 55.4004 110.798 0.500 0.617 -161.759 272.560
ar.L1 -1.1615 0.014 -83.646 0.000 -1.189 -1.134
ar.L2 -1.0268 0.020 -50.661 0.000 -1.067 -0.987
ma.L1 0.4316 6.096 0.071 0.944 -11.516 12.379
ma.L2 0.0841 5.512 0.015 0.988 -10.720 10.888
ma.L3 -0.7658 7.783 -0.098 0.922 -16.021 14.489
ar.S.L12 0.6622 0.101 6.567 0.000 0.465 0.860
ma.S.L12 -1.0187 5.655 -0.180 0.857 -12.103 10.065
sigma2 2.264e+04 2.15e+05 0.105 0.916 -3.99e+05 4.45e+05
===================================================================================
Ljung-Box (L1) (Q): 0.81 Jarque-Bera (JB): 1.23
Prob(Q): 0.37 Prob(JB): 0.54
Heteroskedasticity (H): 0.77 Skew: -0.22
Prob(H) (two-sided): 0.55 Kurtosis: 2.50
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 7.12e+14. Standard errors may be unstable.
*** TRAIN ***
MAE: 218.59
MSE: 111355.65
RMSE: 333.70
MAPE: 12.93%
AIC: 909.0662050492497
*** TEST ***
MAE: 118.64
MSE: 17788.33
RMSE: 133.37
MAPE: 7.63%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 218.589756 | 118.636750 |
| MSE | 111355.647097 | 17788.328890 |
| RMSE | 333.699936 | 133.372894 |
| MAPE (%) | 12.925759 | 7.627024 |
| AIC | 909.066205 | NaN |
# Fit/forecast plot for the COVID-exog model, then residual diagnostics
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           cutoff_date="12-30-2023", vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)

# Ljung-Box test on the residuals of the COVID-exog model
arima_prophet_functions.model_performance(data_check_metrics, main_column)
Ljung-Box Test
lb_stat lb_pvalue
10 19.137908 0.038546
# Rolling forecast over the 2024 hold-out with the COVID exogenous regressor
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (2,1,3),
                                                                                            seasonal_order=(1,0,1,12), train_month_start=84,
                                                                                            train_month_end=94, exog=data_used_model['COVID'])
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN *** MAE: 218.59 MSE: 111355.65 RMSE: 333.70 MAPE: 12.93% AIC: 909.0662050492497 *** TEST *** MAE: 102.91 MSE: 15350.82 RMSE: 123.90 MAPE: 6.66%
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 218.589756 | 102.912664 |
| MSE | 111355.647097 | 15350.818098 |
| RMSE | 333.699936 | 123.898418 |
| MAPE (%) | 12.925759 | 6.657733 |
| AIC | 909.066205 | NaN |
# Plot rolling-forecast results for the COVID-exog model, with confidence band
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                                           date_column='Date_Graph',
                                           prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                                           cutoff_date="12-30-2023",
                                           ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           vertical_line_date=None,
                                           vertical_line_ymin=None, vertical_line_ymax=None)
from arch import arch_model

# Residuals of the last SARIMAX fit (NaNs dropped before fitting)
residuals = data_check_metrics['residuals'].dropna()

# Fit GARCH(1, 1) model on residuals to model conditional volatility
garch_model = arch_model(residuals, vol='Garch', p=1, q=1)

# Fitting
garch_results = garch_model.fit(disp='off')

# Print model summary
print(garch_results.summary())
Constant Mean - GARCH Model Results
==============================================================================
Dep. Variable: residuals R-squared: 0.000
Mean Model: Constant Mean Adj. R-squared: 0.000
Vol Model: GARCH Log-Likelihood: -578.736
Distribution: Normal AIC: 1165.47
Method: Maximum Likelihood BIC: 1175.20
No. Observations: 84
Date: Thu, Jan 30 2025 Df Residuals: 83
Time: 19:51:25 Df Model: 1
Mean Model
========================================================================
coef std err t P>|t| 95.0% Conf. Int.
------------------------------------------------------------------------
mu 10.0397 25.757 0.390 0.697 [-40.444, 60.523]
Volatility Model
=============================================================================
coef std err t P>|t| 95.0% Conf. Int.
-----------------------------------------------------------------------------
omega 2193.5119 8057.773 0.272 0.785 [-1.360e+04,1.799e+04]
alpha[1] 0.0000 0.265 0.000 1.000 [ -0.519, 0.519]
beta[1] 0.9180 0.439 2.089 3.667e-02 [5.686e-02, 1.779]
=============================================================================
Covariance estimator: robust
# Get the conditional volatility (sigma_t)
data_check_metrics['Volatility'] = garch_results.conditional_volatility

# Plot the residuals together with the estimated volatility
plt.figure(figsize=(12, 5))
plt.plot(data_check_metrics['Date'], data_check_metrics['residuals'])
plt.plot(data_check_metrics['Date'], data_check_metrics['Volatility'], label="Estimated Volatility", color='orange')
plt.legend()
plt.title("GARCH Estimated Volatility")
plt.show()

# Forecast volatility for the next 10 periods (months, if monthly data)
forecast_horizon = 10
garch_forecast = garch_results.forecast(horizon=forecast_horizon)

# Extract forecasted conditional variance (square root to get standard deviation)
forecast_volatility = np.sqrt(garch_forecast.variance.values[:, :])
print("Forecasted Volatility for next periods:", forecast_volatility)
Forecasted Volatility for next periods: [[164.31716041 164.25875607 164.20511977 164.15586361 164.11063106 164.06909441 164.03095249 163.99592854 163.96376828 163.93423804]]
# Raw forecasted conditional variances (one row — forecast origin is the last observation)
garch_forecast.variance
| h.01 | h.02 | h.03 | h.04 | h.05 | h.06 | h.07 | h.08 | h.09 | h.10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 83 | 27000.129204 | 26980.938945 | 26963.321358 | 26947.147559 | 26932.299227 | 26918.66774 | 26906.153373 | 26894.664579 | 26884.117308 | 26874.434403 |
from prophet import Prophet
# Prepare the data
data_used_model = data_hosp_MR_used.copy()
# Train/Test split: everything before 2024 is train, 2024 onwards is test.
# Prophet expects the date column named 'ds' and the target named 'y'.
data_model_train = data_used_model.loc[data_used_model.Date < pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})
data_model_test = data_used_model.loc[data_used_model.Date >= pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})
# Baseline Prophet configuration.
# NOTE(review): weekly_seasonality=True on monthly data looks questionable —
# confirm this is intentional (the grid search below varies it anyway).
model = Prophet(growth='linear',
                yearly_seasonality=True,
                weekly_seasonality=True,
                daily_seasonality=False,
                interval_width=0.8,
                changepoint_prior_scale=0.05,
                n_changepoints=25,
                seasonality_mode='multiplicative')
model, data_check_metrics, forecast = arima_prophet_functions.prophet_model_train(model, data_model_train, data_model_test, main_column)
metrics_df = arima_prophet_functions.calculate_model_metrics_prophet(data_check_metrics, main_column)
19:51:25 - cmdstanpy - INFO - Chain [1] start processing 19:51:25 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 106.41 MSE: 18001.68 RMSE: 134.17 MAPE: 6.42% AIC: 1161.4321715226604 *** TEST *** MAE: 138.00 MSE: 46513.27 RMSE: 215.67 MAPE: 9.71%
# Hyper-parameter grid for the Prophet search.
param_grid = {
    'weekly_seasonality': [False, True],
    'daily_seasonality': [False, True],
    'changepoint_prior_scale': [0.01, 0.05, 0.1],
    'n_changepoints': [10, 25, 50],
    'seasonality_mode': ['additive', 'multiplicative']
}
# Cartesian product of all option lists, in the dict's declared order.
grid = list(itertools.product(*param_grid.values()))
mape_lst = []
for param in grid:
    weekly, daily, cp_scale, n_cp, seas_mode = param
    model = Prophet(growth='linear',
                    yearly_seasonality=True,
                    weekly_seasonality=weekly,
                    daily_seasonality=daily,
                    interval_width=0.8,
                    changepoint_prior_scale=cp_scale,
                    n_changepoints=n_cp,
                    seasonality_mode=seas_mode)
    model, data_check_metrics, forecast = arima_prophet_functions.prophet_model_train(model, data_model_train, data_model_test, main_column, plotting=False)
    metrics_df = arima_prophet_functions.calculate_model_metrics_prophet(data_check_metrics, main_column)
    # Record the parameter combination together with its train/test MAPE.
    mape_lst.append({
        "param": param,
        "mape_train": metrics_df['TRAIN'].loc['MAPE (%)'],
        "mape_test": metrics_df['TEST'].loc['MAPE (%)'],
    })
19:51:26 - cmdstanpy - INFO - Chain [1] start processing 19:51:26 - cmdstanpy - INFO - Chain [1] done processing 19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.40 MSE: 32801.00 RMSE: 181.11 MAPE: 8.36% AIC: 1211.8316667542454 *** TEST *** MAE: 201.15 MSE: 69661.03 RMSE: 263.93 MAPE: 13.66%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing 19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.59 MSE: 32767.79 RMSE: 181.02 MAPE: 8.38% AIC: 1211.7465757465359 *** TEST *** MAE: 196.31 MSE: 66701.20 RMSE: 258.27 MAPE: 13.34%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing 19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.40 MSE: 32800.44 RMSE: 181.11 MAPE: 8.36% AIC: 1211.8302326700466 *** TEST *** MAE: 201.45 MSE: 69802.37 RMSE: 264.20 MAPE: 13.68%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing 19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.59 MSE: 32767.90 RMSE: 181.02 MAPE: 8.37% AIC: 1211.7468690093963 *** TEST *** MAE: 196.00 MSE: 66580.07 RMSE: 258.03 MAPE: 13.32%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing 19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.39 MSE: 32800.62 RMSE: 181.11 MAPE: 8.36% AIC: 1211.8307084322405 *** TEST *** MAE: 202.21 MSE: 70169.69 RMSE: 264.90 MAPE: 13.74%
19:51:27 - cmdstanpy - INFO - Chain [1] done processing 19:51:27 - cmdstanpy - INFO - Chain [1] start processing 19:51:27 - cmdstanpy - INFO - Chain [1] done processing 19:51:27 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.56 MSE: 32770.58 RMSE: 181.03 MAPE: 8.38% AIC: 1211.7537234821168 *** TEST *** MAE: 197.35 MSE: 67083.07 RMSE: 259.00 MAPE: 13.42% *** TRAIN *** MAE: 142.39 MSE: 32794.91 RMSE: 181.09 MAPE: 8.36% AIC: 1211.8160710657176 *** TEST *** MAE: 201.75 MSE: 69940.45 RMSE: 264.46 MAPE: 13.71%
19:51:27 - cmdstanpy - INFO - Chain [1] done processing 19:51:27 - cmdstanpy - INFO - Chain [1] start processing 19:51:27 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 142.58 MSE: 32759.43 RMSE: 181.00 MAPE: 8.38% AIC: 1211.7251484645333 *** TEST *** MAE: 196.76 MSE: 66931.26 RMSE: 258.71 MAPE: 13.37%
19:51:27 - cmdstanpy - INFO - Chain [1] start processing 19:51:27 - cmdstanpy - INFO - Chain [1] done processing 19:51:27 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.38 MSE: 32796.04 RMSE: 181.10 MAPE: 8.36% AIC: 1211.8189575866493 *** TEST *** MAE: 201.65 MSE: 69888.98 RMSE: 264.37 MAPE: 13.70% *** TRAIN *** MAE: 142.57 MSE: 32757.38 RMSE: 180.99 MAPE: 8.37% AIC: 1211.719903822935 *** TEST *** MAE: 196.39 MSE: 66745.92 RMSE: 258.35 MAPE: 13.35%
19:51:28 - cmdstanpy - INFO - Chain [1] done processing 19:51:28 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.36 MSE: 32790.47 RMSE: 181.08 MAPE: 8.36% AIC: 1211.8046900707213 *** TEST *** MAE: 201.25 MSE: 69700.30 RMSE: 264.01 MAPE: 13.67%
19:51:28 - cmdstanpy - INFO - Chain [1] done processing 19:51:28 - cmdstanpy - INFO - Chain [1] start processing 19:51:28 - cmdstanpy - INFO - Chain [1] done processing 19:51:28 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.56 MSE: 32746.92 RMSE: 180.96 MAPE: 8.37% AIC: 1211.693060898126 *** TEST *** MAE: 196.58 MSE: 66814.37 RMSE: 258.48 MAPE: 13.36% *** TRAIN *** MAE: 141.31 MSE: 31956.17 RMSE: 178.76 MAPE: 8.30% AIC: 1209.6397957372196 *** TEST *** MAE: 211.42 MSE: 75436.71 RMSE: 274.66 MAPE: 14.43%
19:51:28 - cmdstanpy - INFO - Chain [1] done processing 19:51:28 - cmdstanpy - INFO - Chain [1] start processing 19:51:29 - cmdstanpy - INFO - Chain [1] done processing 19:51:29 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 140.63 MSE: 31548.39 RMSE: 177.62 MAPE: 8.26% AIC: 1208.561008857982 *** TEST *** MAE: 212.72 MSE: 76316.66 RMSE: 276.25 MAPE: 14.53% *** TRAIN *** MAE: 141.30 MSE: 31963.70 RMSE: 178.78 MAPE: 8.30% AIC: 1209.659601587275 *** TEST *** MAE: 211.58 MSE: 75538.79 RMSE: 274.84 MAPE: 14.44%
19:51:29 - cmdstanpy - INFO - Chain [1] done processing 19:51:29 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 140.64 MSE: 31550.61 RMSE: 177.62 MAPE: 8.26% AIC: 1208.5669146851637 *** TEST *** MAE: 212.54 MSE: 76193.88 RMSE: 276.03 MAPE: 14.51%
19:51:29 - cmdstanpy - INFO - Chain [1] done processing 19:51:29 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 141.30 MSE: 31954.82 RMSE: 178.76 MAPE: 8.30% AIC: 1209.6362502297068 *** TEST *** MAE: 211.74 MSE: 75631.62 RMSE: 275.01 MAPE: 14.45%
19:51:30 - cmdstanpy - INFO - Chain [1] done processing 19:51:30 - cmdstanpy - INFO - Chain [1] start processing 19:51:30 - cmdstanpy - INFO - Chain [1] done processing 19:51:30 - cmdstanpy - INFO - Chain [1] start processing 19:51:30 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 140.80 MSE: 31587.62 RMSE: 177.73 MAPE: 8.28% AIC: 1208.6653961907946 *** TEST *** MAE: 211.24 MSE: 75221.15 RMSE: 274.26 MAPE: 14.43% *** TRAIN *** MAE: 142.39 MSE: 32800.34 RMSE: 181.11 MAPE: 8.36% AIC: 1211.829973143091 *** TEST *** MAE: 201.71 MSE: 69919.14 RMSE: 264.42 MAPE: 13.70%
19:51:30 - cmdstanpy - INFO - Chain [1] start processing 19:51:30 - cmdstanpy - INFO - Chain [1] done processing 19:51:30 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.62 MSE: 32768.03 RMSE: 181.02 MAPE: 8.38% AIC: 1211.7471897504593 *** TEST *** MAE: 196.06 MSE: 66583.33 RMSE: 258.04 MAPE: 13.33% *** TRAIN *** MAE: 142.39 MSE: 32800.34 RMSE: 181.11 MAPE: 8.36% AIC: 1211.8299803493007 *** TEST *** MAE: 201.69 MSE: 69912.49 RMSE: 264.41 MAPE: 13.70%
19:51:30 - cmdstanpy - INFO - Chain [1] done processing 19:51:30 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.36 MSE: 32772.97 RMSE: 181.03 MAPE: 8.37% AIC: 1211.7598504525342 *** TEST *** MAE: 196.15 MSE: 66811.70 RMSE: 258.48 MAPE: 13.34%
19:51:31 - cmdstanpy - INFO - Chain [1] done processing 19:51:31 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.45 MSE: 32801.02 RMSE: 181.11 MAPE: 8.37% AIC: 1211.8317235026589 *** TEST *** MAE: 202.53 MSE: 70263.13 RMSE: 265.07 MAPE: 13.75%
19:51:31 - cmdstanpy - INFO - Chain [1] done processing 19:51:31 - cmdstanpy - INFO - Chain [1] start processing 19:51:31 - cmdstanpy - INFO - Chain [1] done processing 19:51:31 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.34 MSE: 32773.47 RMSE: 181.03 MAPE: 8.37% AIC: 1211.7611503883836 *** TEST *** MAE: 196.03 MSE: 66771.15 RMSE: 258.40 MAPE: 13.33% *** TRAIN *** MAE: 142.36 MSE: 32795.33 RMSE: 181.09 MAPE: 8.36% AIC: 1211.817139775301 *** TEST *** MAE: 201.45 MSE: 69818.73 RMSE: 264.23 MAPE: 13.69%
19:51:31 - cmdstanpy - INFO - Chain [1] done processing 19:51:31 - cmdstanpy - INFO - Chain [1] start processing 19:51:31 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 142.58 MSE: 32752.27 RMSE: 180.98 MAPE: 8.37% AIC: 1211.7067887604167 *** TEST *** MAE: 196.34 MSE: 66711.19 RMSE: 258.29 MAPE: 13.35%
19:51:31 - cmdstanpy - INFO - Chain [1] start processing 19:51:32 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 142.39 MSE: 32795.69 RMSE: 181.10 MAPE: 8.36% AIC: 1211.818072129617 *** TEST *** MAE: 201.74 MSE: 69934.01 RMSE: 264.45 MAPE: 13.71%
19:51:32 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.57 MSE: 32751.16 RMSE: 180.97 MAPE: 8.37% AIC: 1211.7039440549875 *** TEST *** MAE: 196.22 MSE: 66653.47 RMSE: 258.17 MAPE: 13.34%
19:51:32 - cmdstanpy - INFO - Chain [1] done processing 19:51:32 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.38 MSE: 32782.34 RMSE: 181.06 MAPE: 8.36% AIC: 1211.7838819515441 *** TEST *** MAE: 201.99 MSE: 70047.40 RMSE: 264.66 MAPE: 13.72%
19:51:32 - cmdstanpy - INFO - Chain [1] done processing 19:51:32 - cmdstanpy - INFO - Chain [1] start processing 19:51:32 - cmdstanpy - INFO - Chain [1] done processing 19:51:33 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 142.47 MSE: 32718.27 RMSE: 180.88 MAPE: 8.37% AIC: 1211.619539455928 *** TEST *** MAE: 196.37 MSE: 66767.81 RMSE: 258.39 MAPE: 13.35% *** TRAIN *** MAE: 141.30 MSE: 31951.45 RMSE: 178.75 MAPE: 8.30% AIC: 1209.6274014812896 *** TEST *** MAE: 211.76 MSE: 75640.61 RMSE: 275.03 MAPE: 14.45%
19:51:33 - cmdstanpy - INFO - Chain [1] done processing 19:51:33 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 126.78 MSE: 25484.13 RMSE: 159.64 MAPE: 7.38% AIC: 1190.6298165258954 *** TEST *** MAE: 166.48 MSE: 43661.03 RMSE: 208.95 MAPE: 11.22%
19:51:33 - cmdstanpy - INFO - Chain [1] done processing 19:51:33 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 141.61 MSE: 32091.08 RMSE: 179.14 MAPE: 8.32% AIC: 1209.9936684566535 *** TEST *** MAE: 209.54 MSE: 74114.53 RMSE: 272.24 MAPE: 14.29%
19:51:34 - cmdstanpy - INFO - Chain [1] done processing 19:51:34 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 127.78 MSE: 25610.41 RMSE: 160.03 MAPE: 7.45% AIC: 1191.0450200695864 *** TEST *** MAE: 167.00 MSE: 43445.94 RMSE: 208.44 MAPE: 11.23%
19:51:34 - cmdstanpy - INFO - Chain [1] done processing 19:51:34 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 141.30 MSE: 31953.40 RMSE: 178.76 MAPE: 8.30% AIC: 1209.6325266563463 *** TEST *** MAE: 211.82 MSE: 75684.75 RMSE: 275.11 MAPE: 14.46%
19:51:35 - cmdstanpy - INFO - Chain [1] done processing 19:51:35 - cmdstanpy - INFO - Chain [1] start processing 19:51:35 - cmdstanpy - INFO - Chain [1] done processing 19:51:35 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 126.84 MSE: 25123.84 RMSE: 158.51 MAPE: 7.39% AIC: 1189.4337481491486 *** TEST *** MAE: 166.13 MSE: 42857.94 RMSE: 207.02 MAPE: 11.16% *** TRAIN *** MAE: 106.04 MSE: 17983.82 RMSE: 134.10 MAPE: 6.40% AIC: 1161.3487833789147 *** TEST *** MAE: 137.95 MSE: 47899.78 RMSE: 218.86 MAPE: 9.68%
19:51:35 - cmdstanpy - INFO - Chain [1] done processing 19:51:36 - cmdstanpy - INFO - Chain [1] start processing 19:51:36 - cmdstanpy - INFO - Chain [1] done processing 19:51:36 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.49 MSE: 18027.10 RMSE: 134.27 MAPE: 6.42% AIC: 1161.5507381736484 *** TEST *** MAE: 137.80 MSE: 46284.04 RMSE: 215.14 MAPE: 9.69% *** TRAIN *** MAE: 105.97 MSE: 17984.66 RMSE: 134.11 MAPE: 6.40% AIC: 1161.3527042991773 *** TEST *** MAE: 137.92 MSE: 47963.03 RMSE: 219.00 MAPE: 9.67%
19:51:36 - cmdstanpy - INFO - Chain [1] done processing 19:51:36 - cmdstanpy - INFO - Chain [1] start processing 19:51:36 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 106.50 MSE: 18027.48 RMSE: 134.27 MAPE: 6.42% AIC: 1161.5524758182455 *** TEST *** MAE: 138.20 MSE: 46500.07 RMSE: 215.64 MAPE: 9.71%
19:51:36 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.25 MSE: 17982.79 RMSE: 134.10 MAPE: 6.41% AIC: 1161.3440065418965 *** TEST *** MAE: 137.44 MSE: 47331.86 RMSE: 217.56 MAPE: 9.63%
19:51:36 - cmdstanpy - INFO - Chain [1] done processing 19:51:36 - cmdstanpy - INFO - Chain [1] start processing 19:51:37 - cmdstanpy - INFO - Chain [1] done processing 19:51:37 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.43 MSE: 18030.12 RMSE: 134.28 MAPE: 6.42% AIC: 1161.5647924541381 *** TEST *** MAE: 138.86 MSE: 47111.08 RMSE: 217.05 MAPE: 9.78% *** TRAIN *** MAE: 106.17 MSE: 17977.40 RMSE: 134.08 MAPE: 6.41% AIC: 1161.3188211637055 *** TEST *** MAE: 137.41 MSE: 47410.59 RMSE: 217.74 MAPE: 9.63%
19:51:37 - cmdstanpy - INFO - Chain [1] done processing 19:51:37 - cmdstanpy - INFO - Chain [1] start processing 19:51:37 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 106.05 MSE: 17881.67 RMSE: 133.72 MAPE: 6.39% AIC: 1160.8703092476076 *** TEST *** MAE: 137.73 MSE: 46769.56 RMSE: 216.26 MAPE: 9.71%
19:51:37 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.13 MSE: 17961.46 RMSE: 134.02 MAPE: 6.40% AIC: 1161.2442876564796 *** TEST *** MAE: 137.44 MSE: 47474.69 RMSE: 217.89 MAPE: 9.64%
19:51:37 - cmdstanpy - INFO - Chain [1] done processing 19:51:37 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.41 MSE: 18001.68 RMSE: 134.17 MAPE: 6.42% AIC: 1161.4321715226604 *** TEST *** MAE: 138.00 MSE: 46513.27 RMSE: 215.67 MAPE: 9.71%
19:51:37 - cmdstanpy - INFO - Chain [1] done processing 19:51:38 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.15 MSE: 17968.68 RMSE: 134.05 MAPE: 6.41% AIC: 1161.2780455124705 *** TEST *** MAE: 137.50 MSE: 47481.62 RMSE: 217.90 MAPE: 9.64%
19:51:38 - cmdstanpy - INFO - Chain [1] done processing 19:51:38 - cmdstanpy - INFO - Chain [1] start processing 19:51:38 - cmdstanpy - INFO - Chain [1] done processing 19:51:38 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.00 MSE: 17867.13 RMSE: 133.67 MAPE: 6.39% AIC: 1160.8019883422219 *** TEST *** MAE: 137.69 MSE: 46800.76 RMSE: 216.33 MAPE: 9.71% *** TRAIN *** MAE: 104.22 MSE: 17409.11 RMSE: 131.94 MAPE: 6.29% AIC: 1158.6205991769664 *** TEST *** MAE: 137.42 MSE: 50423.11 RMSE: 224.55 MAPE: 9.77%
19:51:38 - cmdstanpy - INFO - Chain [1] done processing 19:51:38 - cmdstanpy - INFO - Chain [1] start processing 19:51:38 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 103.95 MSE: 17323.51 RMSE: 131.62 MAPE: 6.27% AIC: 1158.2065347614514 *** TEST *** MAE: 141.38 MSE: 50892.06 RMSE: 225.59 MAPE: 10.04%
19:51:38 - cmdstanpy - INFO - Chain [1] start processing 19:51:39 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 104.21 MSE: 17408.77 RMSE: 131.94 MAPE: 6.29% AIC: 1158.6189284450898 *** TEST *** MAE: 137.12 MSE: 50268.29 RMSE: 224.21 MAPE: 9.75%
19:51:39 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 103.95 MSE: 17324.57 RMSE: 131.62 MAPE: 6.27% AIC: 1158.2116976583309 *** TEST *** MAE: 141.19 MSE: 50797.02 RMSE: 225.38 MAPE: 10.03%
19:51:39 - cmdstanpy - INFO - Chain [1] done processing 19:51:39 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 104.18 MSE: 17399.29 RMSE: 131.91 MAPE: 6.29% AIC: 1158.5731800224462 *** TEST *** MAE: 137.20 MSE: 50316.80 RMSE: 224.31 MAPE: 9.76%
19:51:39 - cmdstanpy - INFO - Chain [1] done processing 19:51:39 - cmdstanpy - INFO - Chain [1] start processing 19:51:40 - cmdstanpy - INFO - Chain [1] done processing 19:51:40 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 103.96 MSE: 17326.88 RMSE: 131.63 MAPE: 6.27% AIC: 1158.2228920325133 *** TEST *** MAE: 141.17 MSE: 50776.34 RMSE: 225.34 MAPE: 10.03% *** TRAIN *** MAE: 106.18 MSE: 17985.53 RMSE: 134.11 MAPE: 6.41% AIC: 1161.3567690185905 *** TEST *** MAE: 137.24 MSE: 47156.26 RMSE: 217.15 MAPE: 9.61%
19:51:40 - cmdstanpy - INFO - Chain [1] done processing 19:51:40 - cmdstanpy - INFO - Chain [1] start processing 19:51:40 - cmdstanpy - INFO - Chain [1] done processing 19:51:40 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.52 MSE: 18027.88 RMSE: 134.27 MAPE: 6.42% AIC: 1161.554372704546 *** TEST *** MAE: 138.52 MSE: 46710.96 RMSE: 216.13 MAPE: 9.74% *** TRAIN *** MAE: 106.18 MSE: 17982.64 RMSE: 134.10 MAPE: 6.41% AIC: 1161.3433143211632 *** TEST *** MAE: 137.55 MSE: 47477.13 RMSE: 217.89 MAPE: 9.64%
19:51:40 - cmdstanpy - INFO - Chain [1] done processing 19:51:40 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.50 MSE: 18027.44 RMSE: 134.27 MAPE: 6.42% AIC: 1161.552308945461 *** TEST *** MAE: 138.03 MSE: 46439.50 RMSE: 215.50 MAPE: 9.70%
19:51:40 - cmdstanpy - INFO - Chain [1] done processing 19:51:41 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.28 MSE: 17982.96 RMSE: 134.10 MAPE: 6.41% AIC: 1161.3447709548673 *** TEST *** MAE: 137.40 MSE: 47291.94 RMSE: 217.47 MAPE: 9.63%
19:51:41 - cmdstanpy - INFO - Chain [1] done processing 19:51:41 - cmdstanpy - INFO - Chain [1] start processing 19:51:41 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 106.54 MSE: 18027.82 RMSE: 134.27 MAPE: 6.42% AIC: 1161.55407501765 *** TEST *** MAE: 138.34 MSE: 46666.54 RMSE: 216.02 MAPE: 9.73% *** TRAIN *** MAE: 106.16 MSE: 17973.81 RMSE: 134.07 MAPE: 6.41% AIC: 1161.302058713311 *** TEST *** MAE: 137.43 MSE: 47430.47 RMSE: 217.79 MAPE: 9.63%
19:51:41 - cmdstanpy - INFO - Chain [1] start processing 19:51:41 - cmdstanpy - INFO - Chain [1] done processing 19:51:41 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 78.39 MSE: 10347.81 RMSE: 101.72 MAPE: 4.66% AIC: 1114.9222427207478 *** TEST *** MAE: 137.22 MSE: 40949.61 RMSE: 202.36 MAPE: 9.34%
19:51:42 - cmdstanpy - INFO - Chain [1] done processing 19:51:42 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.16 MSE: 17973.86 RMSE: 134.07 MAPE: 6.41% AIC: 1161.3022664464231 *** TEST *** MAE: 137.48 MSE: 47460.77 RMSE: 217.85 MAPE: 9.64%
19:51:42 - cmdstanpy - INFO - Chain [1] done processing 19:51:42 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.44 MSE: 17999.91 RMSE: 134.16 MAPE: 6.42% AIC: 1161.423904386426 *** TEST *** MAE: 138.07 MSE: 46524.80 RMSE: 215.70 MAPE: 9.71%
19:51:42 - cmdstanpy - INFO - Chain [1] done processing 19:51:42 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 106.12 MSE: 17960.55 RMSE: 134.02 MAPE: 6.40% AIC: 1161.2400553588784 *** TEST *** MAE: 137.26 MSE: 47378.64 RMSE: 217.67 MAPE: 9.62%
19:51:44 - cmdstanpy - INFO - Chain [1] done processing 19:51:44 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 73.85 MSE: 8935.99 RMSE: 94.53 MAPE: 4.38% AIC: 1102.6004215176854 *** TEST *** MAE: 138.75 MSE: 40876.14 RMSE: 202.18 MAPE: 9.58%
19:51:45 - cmdstanpy - INFO - Chain [1] done processing 19:51:45 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 104.22 MSE: 17407.91 RMSE: 131.94 MAPE: 6.29% AIC: 1158.6147988247596 *** TEST *** MAE: 137.41 MSE: 50412.70 RMSE: 224.53 MAPE: 9.77%
19:51:45 - cmdstanpy - INFO - Chain [1] done processing 19:51:45 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 76.03 MSE: 10065.00 RMSE: 100.32 MAPE: 4.52% AIC: 1112.594468393507 *** TEST *** MAE: 140.32 MSE: 40447.70 RMSE: 201.12 MAPE: 9.52%
19:51:45 - cmdstanpy - INFO - Chain [1] done processing 19:51:45 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 104.21 MSE: 17406.04 RMSE: 131.93 MAPE: 6.29% AIC: 1158.605747442009 *** TEST *** MAE: 137.34 MSE: 50371.84 RMSE: 224.44 MAPE: 9.77%
19:51:47 - cmdstanpy - INFO - Chain [1] done processing 19:51:47 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 67.21 MSE: 7282.93 RMSE: 85.34 MAPE: 4.00% AIC: 1085.417928875907 *** TEST *** MAE: 168.19 MSE: 56249.75 RMSE: 237.17 MAPE: 11.77%
19:51:47 - cmdstanpy - INFO - Chain [1] done processing 19:51:47 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN *** MAE: 104.20 MSE: 17404.68 RMSE: 131.93 MAPE: 6.29% AIC: 1158.5991978988607 *** TEST *** MAE: 137.32 MSE: 50369.69 RMSE: 224.43 MAPE: 9.77%
19:51:48 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN *** MAE: 38.17 MSE: 2551.88 RMSE: 50.52 MAPE: 2.28% AIC: 997.326940622734 *** TEST *** MAE: 228.10 MSE: 91715.28 RMSE: 302.85 MAPE: 15.66%
# Collect the grid-search records into a DataFrame (one row per parameter combination)
results_grid = pd.DataFrame(mape_lst)
results_grid
| param | mape_train | mape_test | |
|---|---|---|---|
| 0 | (False, False, 0.01, 10, additive) | 8.361144 | 13.662470 |
| 1 | (False, False, 0.01, 10, multiplicative) | 8.375450 | 13.342177 |
| 2 | (False, False, 0.01, 25, additive) | 8.362008 | 13.684633 |
| 3 | (False, False, 0.01, 25, multiplicative) | 8.374638 | 13.319802 |
| 4 | (False, False, 0.01, 50, additive) | 8.364424 | 13.737819 |
| ... | ... | ... | ... |
| 67 | (True, True, 0.1, 10, multiplicative) | 4.517779 | 9.520318 |
| 68 | (True, True, 0.1, 25, additive) | 6.290986 | 9.766588 |
| 69 | (True, True, 0.1, 25, multiplicative) | 4.002654 | 11.770559 |
| 70 | (True, True, 0.1, 50, additive) | 6.290383 | 9.765437 |
| 71 | (True, True, 0.1, 50, multiplicative) | 2.282970 | 15.656989 |
72 rows × 3 columns
# Combined train+test MAPE, then rank configurations by test MAPE (lower is better)
results_grid['MAPE_Sum'] = results_grid['mape_train'] + results_grid['mape_test']
results_grid.sort_values('mape_test')
| param | mape_train | mape_test | MAPE_Sum | |
|---|---|---|---|---|
| 61 | (True, True, 0.05, 10, multiplicative) | 4.659438 | 9.341163 | 14.000601 |
| 67 | (True, True, 0.1, 10, multiplicative) | 4.517779 | 9.520318 | 14.038097 |
| 65 | (True, True, 0.05, 50, multiplicative) | 4.379537 | 9.578923 | 13.958460 |
| 54 | (True, True, 0.01, 10, additive) | 6.409693 | 9.608355 | 16.018049 |
| 64 | (True, True, 0.05, 50, additive) | 6.403911 | 9.623915 | 16.027826 |
| ... | ... | ... | ... | ... |
| 30 | (False, True, 0.1, 10, additive) | 8.301276 | 14.452606 | 22.753882 |
| 34 | (False, True, 0.1, 50, additive) | 8.301606 | 14.457283 | 22.758889 |
| 15 | (False, False, 0.1, 25, multiplicative) | 8.264668 | 14.514892 | 22.779560 |
| 13 | (False, False, 0.1, 10, multiplicative) | 8.264064 | 14.527584 | 22.791647 |
| 71 | (True, True, 0.1, 50, multiplicative) | 2.282970 | 15.656989 | 17.939958 |
72 rows × 4 columns
# Chosen configuration, in grid order:
# (weekly_seasonality, daily_seasonality, changepoint_prior_scale,
#  n_changepoints, seasonality_mode)
param = (True, False, 0.01, 50, "additive")
weekly, daily, cp_scale, n_cp, seas_mode = param
model = Prophet(growth='linear',
                yearly_seasonality=True,
                weekly_seasonality=weekly,
                daily_seasonality=daily,
                interval_width=0.8,
                changepoint_prior_scale=cp_scale,
                n_changepoints=n_cp,
                seasonality_mode=seas_mode)
# Prepare the data
data_used_model = data_hosp_MR_used.copy()
# Train/Test split at the start of 2024 (Prophet column names: 'ds', 'y')
data_model_train = data_used_model.loc[data_used_model.Date < pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})
data_model_test = data_used_model.loc[data_used_model.Date >= pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})
model, data_check_metrics, forecast = arima_prophet_functions.prophet_model_train(model, data_model_train, data_model_test, main_column=main_column)
metrics_df = arima_prophet_functions.calculate_model_metrics_prophet(data_check_metrics, main_column)
arima_prophet_functions.calcualate_model_performance_prophet(model, data_check_metrics, main_column)
19:51:48 - cmdstanpy - INFO - Chain [1] start processing 19:51:49 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 106.25
MSE: 17982.79
RMSE: 134.10
MAPE: 6.41%
AIC: 1161.3440065418965
*** TEST ***
MAE: 137.44
MSE: 47331.86
RMSE: 217.56
MAPE: 9.63%
Ljung-Box Test
lb_stat lb_pvalue
10 36.978557 0.000057
# Display the TRAIN/TEST metrics table for the final Prophet model
metrics_df
| Dataset | TRAIN | TEST |
|---|---|---|
| MAE | 106.247953 | 137.438717 |
| MSE | 17982.792915 | 47331.864539 |
| RMSE | 134.099936 | 217.558876 |
| MAPE (%) | 6.409770 | 9.631075 |
| AIC | 1161.344007 | NaN |
# Plot actuals vs fitted (train period) and the forecast with its confidence
# band (test period), splitting the two at cutoff_date.
# NOTE(review): cutoff_date is written MM-DD-YYYY ("12-30-2023") — confirm the
# plotting helper parses this format as intended.
arima_prophet_functions.plot_model_results_prophet(data_check_metrics, main_column,
date_column='Date_Graph',
prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
cutoff_date="12-30-2023", vertical_line_date=None,
vertical_line_ymin=None, vertical_line_ymax=None)
import pandas as pd
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
import warnings
import itertools
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import het_breuschpagan
### ARIMA Functions
def train_arima_model(data, order, train_month=84, seasonal_order=None, exog=None, trend=None):
    """
    Fit an ARIMA/ARIMAX/SARIMAX model on the first `train_month` observations.

    Parameters:
        data (pd.Series): Time series data (index should be datetime-like).
        order (tuple): ARIMA order (p, d, q).
        train_month (int): Number of leading observations used for training;
            the remainder of `data` is held out.
        seasonal_order (tuple, optional): Seasonal order (P, D, Q, s).
            Defaults to no seasonality.
        exog (pd.DataFrame, optional): External regressors for ARIMAX/SARIMAX.
        trend (str, optional): Trend component ('n', 'c', 't', 'ct').

    Returns:
        The fitted SARIMAX results object (exposes `fittedvalues`,
        `get_forecast`, `aic`, ...).
    """
    # Train on the leading `train_month` points only.
    train_data = data.iloc[:train_month]
    train_size = train_data.shape[0]
    # Align exogenous regressors with the training window, if provided.
    if exog is not None:
        exog_train = np.array(exog.iloc[:train_size])
    else:
        exog_train = None
    # SARIMAX covers plain ARIMA as well (seasonal_order defaults to none).
    # Stationarity/invertibility enforcement is disabled so the optimizer
    # does not fail on borderline parameterizations.
    model = SARIMAX(train_data,
                    order=order,
                    seasonal_order=seasonal_order if seasonal_order else (0, 0, 0, 0),
                    exog=exog_train,
                    trend=trend,
                    enforce_stationarity=False,
                    enforce_invertibility=False)
    # Fit and return the results object directly; unused leftovers from an
    # earlier version (test split, in-sample predictions) have been removed.
    return model.fit()
def make_metrics_dataframe(model, data_model_train, data_model_test, main_column, exog_test=None, plotting=True, *args, cutoff_date="2024-10-10", **kwargs):
    """
    Build a combined train+test DataFrame with fitted values, out-of-sample
    predictions and their confidence interval.

    Parameters:
        model: Fitted statsmodels-like results object exposing
            `fittedvalues` and `get_forecast(steps, exog)`.
        data_model_train (pd.DataFrame): Training rows; must contain
            'Date', 'Date_Graph' and `main_column`.
        data_model_test (pd.DataFrame): Held-out rows with the same columns.
        main_column (str): Name of the target column.
        exog_test (array-like, optional): Exogenous regressors for the forecast.
        plotting (bool): Unused; kept for interface compatibility.
        cutoff_date (str): Keyword-only. Rows with `Date` past this date are
            dropped (previously hard-coded to "2024-10-10").

    Returns:
        pd.DataFrame: One row per period with columns `Date`, `Date_Graph`,
        `{main_column}`, `{main_column}_Fitted`, `{main_column}_Prediction`,
        `{main_column}_Prediction_CI_low`, `{main_column}_Prediction_CI_upp`.
    """
    # Forecast exactly as many steps as there are held-out periods.
    forecast_steps = data_model_test.shape[0]
    forecast = model.get_forecast(steps=forecast_steps, exog=exog_test)
    forecast_values = forecast.predicted_mean
    forecast_ci = forecast.conf_int()
    # Assemble the final dataset: train rows first, then test rows.
    train_size = data_model_train.shape[0]
    data_check_metrics = pd.concat([data_model_train, data_model_test])
    data_check_metrics = data_check_metrics[['Date', 'Date_Graph', main_column]]
    # Fitted values align on the train index; test rows stay NaN.
    data_check_metrics.loc[:, f"{main_column}_Fitted"] = model.fittedvalues
    # Pad the train period with None so predictions/CIs align on the test index.
    data_check_metrics.loc[:, f"{main_column}_Prediction"] = pd.concat([pd.Series([None]*train_size), forecast_values])
    data_check_metrics.loc[:, f"{main_column}_Prediction_CI_low"] = pd.concat([pd.Series([None]*train_size), forecast_ci[f'lower {main_column}']])
    data_check_metrics.loc[:, f"{main_column}_Prediction_CI_upp"] = pd.concat([pd.Series([None]*train_size), forecast_ci[f'upper {main_column}']])
    # Drop any periods past the cutoff (incomplete trailing data).
    data_check_metrics = data_check_metrics.loc[data_check_metrics.Date <= pd.to_datetime(cutoff_date)]
    return data_check_metrics
def calculate_model_metrics(model, data_check_metrics, main_column):
    """
    Compute MAE/MSE/RMSE/MAPE for the train (fitted) and test (predicted)
    periods, print them, and return them as a DataFrame.

    Parameters:
        model: Fitted results object exposing an `aic` attribute.
        data_check_metrics (pd.DataFrame): Must contain `main_column`,
            f"{main_column}_Fitted" and f"{main_column}_Prediction".
        main_column (str): Name of the target column.

    Returns:
        pd.DataFrame: Metric names ('MAE', 'MSE', 'RMSE', 'MAPE (%)', 'AIC')
        as rows and datasets ('TRAIN', 'TEST') as columns. AIC is NaN for TEST.
    """
    actual = data_check_metrics[main_column]

    def _error_stats(predicted):
        # Pandas' mean skips NaN, so rows outside the relevant period
        # (fitted is NaN on test, prediction is NaN on train) are ignored.
        err = actual - predicted
        mae = np.mean(np.abs(err))
        mse = np.mean(err ** 2)
        return {
            "MAE": mae,
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "MAPE (%)": np.mean(np.abs(err / actual)) * 100,
        }

    train_stats = _error_stats(data_check_metrics[f"{main_column}_Fitted"])
    train_stats["AIC"] = model.aic
    test_stats = _error_stats(data_check_metrics[f"{main_column}_Prediction"])
    test_stats["AIC"] = None  # AIC only makes sense in-sample

    # One row per dataset, then transpose so metrics become the index.
    rows = [{"Dataset": label, **stats}
            for label, stats in (("TRAIN", train_stats), ("TEST", test_stats))]
    metrics_df = pd.DataFrame(rows).set_index("Dataset").T

    # Print the metrics in the same layout as before.
    print("*** TRAIN ***")
    print(f"MAE: {train_stats['MAE']:.2f}")
    print(f"MSE: {train_stats['MSE']:.2f}")
    print(f"RMSE: {train_stats['RMSE']:.2f}")
    print(f"MAPE: {train_stats['MAPE (%)']:.2f}%")
    print(f"AIC: {train_stats['AIC']}")
    print("\n*** TEST ***")
    print(f"MAE: {test_stats['MAE']:.2f}")
    print(f"MSE: {test_stats['MSE']:.2f}")
    print(f"RMSE: {test_stats['RMSE']:.2f}")
    print(f"MAPE: {test_stats['MAPE (%)']:.2f}%")
    return metrics_df
def model_performance(data_check_metrics, main_column):
    """
    Residual diagnostics for a fitted model: Ljung-Box autocorrelation test
    plus residual-over-time, histogram and ACF/PACF plots.

    NOTE: mutates `data_check_metrics` in place by adding a 'residuals' column.

    Parameters:
        data_check_metrics (pd.DataFrame): Must contain `main_column`
            (actuals) and f"{main_column}_Fitted" (in-sample fitted values).
        main_column (str): Name of the target column.
    """
    # The fitted series is only populated on the training period, so its
    # non-NaN length gives the train size.
    train_size = data_check_metrics[f"{main_column}_Fitted"].dropna().shape[0]
    real = data_check_metrics[main_column].iloc[0:train_size]
    fitted = data_check_metrics[f"{main_column}_Fitted"].iloc[0:train_size]
    # Residuals (actual - fitted), computed on the training period only.
    data_check_metrics['residuals'] = real - fitted
    residuals = data_check_metrics['residuals'].dropna()
    from statsmodels.stats.diagnostic import acorr_ljungbox
    # Ljung-Box test for autocorrelation in the residuals at lag 10.
    ljung_box_results = acorr_ljungbox(data_check_metrics['residuals'].dropna(), lags=[10], return_df=True)
    # Display results
    print('\n')
    print('Ljung-Box Test')
    print(ljung_box_results)
    # test_stat, p_value, _, _ = het_breuschpagan(residuals, exog_het=np.arange(len(residuals)))
    # # Display results
    # print('\n')
    # print('Breusch-Pagan Test')
    # print(test_stat, p_value)
    # Residuals plot
    plt.figure(figsize=(10, 6))
    plt.plot(residuals, label='Residuals', color='blue')
    plt.axhline(0, color='red', linestyle='--', linewidth=1)
    plt.title('Residuals Over Time')
    plt.xlabel('Time')
    plt.ylabel('Residuals')
    plt.legend()
    plt.show()
    # Histogram of residuals
    plt.figure(figsize=(8, 5))
    plt.hist(residuals, bins=30, color='gray', edgecolor='black')
    plt.title('Histogram of Residuals')
    plt.xlabel('Residual Value')
    plt.ylabel('Frequency')
    plt.show()
    # ACF and PACF of residuals, side by side
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plot_acf(residuals, lags=10, ax=plt.gca(), title="ACF of Residuals")
    plt.subplot(1, 2, 2)
    plot_pacf(residuals, lags=10, ax=plt.gca(), title="PACF of Residuals")
    plt.tight_layout()
    plt.show()
def rolling_prediction_function(data_used_model, main_column, arima_order, seasonal_order=None, train_month_start=84, train_month_end=94, exog=None):
    """Build expanding-window one-step-ahead forecasts for an (S)ARIMA model.

    For each month in [train_month_start, train_month_end) the model is
    refitted on the data up to that month and the first out-of-sample
    prediction (with its confidence interval) is collected.

    Parameters:
        data_used_model (pd.DataFrame): Full dataset containing `main_column`.
        main_column (str): Name of the series to model.
        arima_order (tuple): (p, d, q) order passed to the trainer.
        seasonal_order (tuple, optional): (P, D, Q, s) seasonal order.
        train_month_start (int): Index of the first forecast origin.
        train_month_end (int): Index one past the last forecast origin.
        exog (pd.DataFrame, optional): Exogenous regressors.

    Returns:
        tuple: (metrics DataFrame with rolling predictions spliced in,
        model fitted on the initial training window).
    """
    train_part = data_used_model.iloc[:train_month_start]
    test_part = data_used_model.iloc[train_month_start:]

    # Baseline fit on the initial window; its metrics frame is the template
    # into which the rolling predictions are spliced below.
    model_first = train_arima_model(data_used_model[main_column], arima_order,
                                    train_month=train_month_start,
                                    exog=exog, seasonal_order=seasonal_order)
    if exog is None:
        first_data_check_metrics = make_metrics_dataframe(model_first, train_part, test_part, main_column)
    else:
        first_data_check_metrics = make_metrics_dataframe(model_first, train_part, test_part, main_column,
                                                          exog_test=exog.loc[test_part.index])

    preds, ci_low, ci_upp = [], [], []
    for train_month in range(train_month_start, train_month_end):
        model = train_arima_model(data_used_model[main_column], arima_order,
                                  train_month=train_month,
                                  seasonal_order=seasonal_order, exog=exog)
        exog_test = np.array(exog.iloc[train_month_start:]) if exog is not None else None
        frame = make_metrics_dataframe(model, train_part, test_part, main_column, exog_test=exog_test)
        # Keep only the first forecasted point of each refit (one step ahead).
        preds.append(frame[f"{main_column}_Prediction"].dropna().iloc[0])
        ci_low.append(frame[f"{main_column}_Prediction_CI_low"].dropna().iloc[0])
        ci_upp.append(frame[f"{main_column}_Prediction_CI_upp"].dropna().iloc[0])

    # Pad the training span with None so the columns align row-for-row.
    padding = pd.Series([None] * train_month_start)
    final_data_check_metrics = first_data_check_metrics.copy()
    final_data_check_metrics[f"{main_column}_Prediction"] = pd.concat([padding, pd.Series(preds)], ignore_index=True)
    final_data_check_metrics[f"{main_column}_Prediction_CI_low"] = pd.concat([padding, pd.Series(ci_low)], ignore_index=True)
    final_data_check_metrics[f"{main_column}_Prediction_CI_upp"] = pd.concat([padding, pd.Series(ci_upp)], ignore_index=True)
    return final_data_check_metrics, model_first
def grid_search_arima(data, min_p, max_p, min_q, max_q, d,
                      min_P, max_P, min_Q, max_Q,
                      min_D, max_D, seasonal_period, exog=None,
                      trends=None):
    """
    Perform a grid search to find the best ARIMA/SARIMA model.

    Parameters:
        data (pd.Series): Time series data (index should be datetime-like).
        min_p (int): Minimum value for p (AR order).
        max_p (int): Maximum value for p (AR order).
        min_q (int): Minimum value for q (MA order).
        max_q (int): Maximum value for q (MA order).
        d (int): Differencing order (fixed, not searched).
        min_P (int): Minimum value for seasonal P (seasonal AR order).
        max_P (int): Maximum value for seasonal P (seasonal AR order).
        min_Q (int): Minimum value for seasonal Q (seasonal MA order).
        max_Q (int): Maximum value for seasonal Q (seasonal MA order).
        min_D (int): Minimum value for seasonal D (seasonal differencing).
        max_D (int): Maximum value for seasonal D (seasonal differencing).
        seasonal_period (int): Seasonal period (s).
        exog (pd.DataFrame, optional): External regressors for ARIMAX.
        trends (list, optional): Trend specifications to try; defaults to
            ['n', 'c', 't', 'ct'].

    Returns:
        pd.DataFrame: Grid search results with model orders and AIC values.
        Combinations that fail to fit are reported and skipped.
    """
    results = []
    if not trends:
        trends = ['n', 'c', 't', 'ct']  # Trend options
    # Generate parameter grid over all order/trend combinations
    for p, q, P, Q, D, trend in itertools.product(
        range(min_p, max_p + 1),
        range(min_q, max_q + 1),
        range(min_P, max_P + 1),
        range(min_Q, max_Q + 1),
        range(min_D, max_D + 1),
        trends
    ):
        try:
            # Fit SARIMAX model for this parameter combination
            model = train_arima_model(data, order=(p, d, q),
                                      seasonal_order=(P, D, Q, seasonal_period),
                                      trend=trend, exog=exog)
            # Store results
            results.append({
                'p': p,
                'q': q,
                'd': d,
                'P': P,
                'Q': Q,
                'D': D,
                'seasonal_period': seasonal_period,
                'trend': trend,
                'AIC': model.aic
            })
        except Exception as e:
            # Report which combination failed and why, then keep searching.
            # (Previously only a bare 'error' was printed, hiding the cause.)
            print(p, q, d, P, Q, D, trend)
            print(f'error: {e}')
            continue
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    return results_df
def plot_model_results(data, main_column, date_column='Date_Graph',
                       prediction_column=None, fitted_column=None,
                       ci_low_column=None, ci_upp_column=None,
                       cutoff_date=None, vertical_line_date=None,
                       vertical_line_ymin=None, vertical_line_ymax=None,
                       figsize=None):
    """
    Plot model results: actual, fitted, and predicted values with confidence intervals.

    Draws a full-range plot and, when `cutoff_date` is given, a second
    zoomed plot of the rows after the cutoff.

    Parameters:
        data (pd.DataFrame): DataFrame containing the time series data.
        main_column (str): Column name for the actual values.
        date_column (str): Column name for the date. Default is 'Date_Graph'.
        prediction_column (str): Column name for predictions.
        fitted_column (str): Column name for fitted values.
        ci_low_column (str): Column name for lower confidence interval.
        ci_upp_column (str): Column name for upper confidence interval.
        cutoff_date (str): Date to filter the data for predictions. Format: 'YYYY-MM-DD'.
        vertical_line_date (str): Date for a vertical line. Format: 'YYYY-MM-DD'.
        vertical_line_ymin (float): Y-axis minimum for the vertical line.
        vertical_line_ymax (float): Y-axis maximum for the vertical line.
        figsize (tuple): Figure size; defaults to (10, 6).

    Returns:
        None: Displays the plot(s).
    """
    if figsize is None:
        figsize = (10, 6)
    # Work on a copy so the caller's DataFrame is not mutated by the
    # datetime conversion below (the original modified `data` in place).
    data = data.copy()
    # Ensure date_column is in datetime format
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')
    # Full-range plot: actuals plus optional fitted/predicted overlays
    plt.figure(figsize=figsize)
    plt.plot(data[date_column], data[main_column], label='Actual', color="orange", alpha=0.7)
    if fitted_column:
        plt.plot(data[date_column], data[fitted_column], label='Fitted', color='blue', linestyle="dashed", linewidth=3)
    if prediction_column:
        plt.plot(data[date_column], data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')
    if vertical_line_date:
        plt.vlines(x=pd.to_datetime(vertical_line_date), linestyles='--', color='orange', ymin=vertical_line_ymin, ymax=vertical_line_ymax)
    # Add titles and labels
    plt.title('Model - Actual vs Fitted and Forecast')
    plt.xlabel('Date')
    plt.ylabel('Load')
    plt.ylim(data[main_column].min() - 200, data[main_column].max() + 200)
    plt.legend()
    plt.grid()
    # Xticks
    xticks = data[date_column][::2]  # Select every second tick
    plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
    plt.show()
    if cutoff_date:
        # Zoomed plot on rows after the cutoff. .copy() avoids
        # SettingWithCopy issues on the assignments below.
        filtered_data = data.loc[data[date_column] > pd.to_datetime(cutoff_date)].copy()
        if prediction_column:
            # Bridge the gap between fitted and predicted at the cutoff.
            filtered_data[prediction_column] = filtered_data[prediction_column].fillna(filtered_data[fitted_column])
        # Plot the filtered data
        plt.figure(figsize=figsize)
        plt.plot(filtered_data[date_column], filtered_data[main_column], label='Actual', color="orange", alpha=0.7)
        if prediction_column:
            plt.plot(filtered_data[date_column], filtered_data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')
        if ci_low_column and ci_upp_column:
            # HACK: hard-coded splice of the 2023-12-01 fitted value into the
            # prediction column, and fillna(1500) for missing CI values —
            # presumably tuned to this specific dataset; revisit for other data.
            filtered_data.loc[filtered_data["Date"] == pd.to_datetime("2023-12-01"),
                              f"{main_column}_Prediction"] = data.loc[data["Date"] == pd.to_datetime("2023-12-01"), f"{main_column}_Fitted"]
            filtered_data = filtered_data.fillna(1500)
            plt.fill_between(filtered_data[date_column],
                             filtered_data[ci_low_column],
                             filtered_data[ci_upp_column],
                             color='pink', alpha=0.3, label='Confidence Interval')
        # Add titles and labels
        plt.title('Model - Actual vs Forecast (Filtered)')
        plt.xlabel('Date')
        plt.ylabel('Load')
        if ci_low_column and ci_upp_column:
            plt.ylim(filtered_data[ci_low_column].min() - 100, filtered_data[ci_upp_column].max() + 100)
        else:
            plt.ylim(filtered_data[main_column].min() - 200, filtered_data[main_column].max() + 200)
        plt.legend()
        plt.grid()
        # Xticks
        xticks = filtered_data[date_column][::2]
        plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
        plt.show()
### Prophet Functions
def calculate_aic(col_true, col_prediction, num_params=50):
    """Compute an AIC-style score from actual vs predicted series.

    The two series are aligned on the indexes where both hold non-NaN
    values, then a Gaussian log-likelihood is derived from the residuals.

    Parameters:
        col_true (pd.Series): Observed values.
        col_prediction (pd.Series): Predicted or fitted values.
        num_params (int): Number of model parameters k used in the 2k
            penalty. # NOTE(review): default of 50 looks like a rough
            stand-in for Prophet's effective parameter count — confirm.

    Returns:
        float: AIC value. Can be inf/NaN for degenerate residuals
        (e.g. a perfect fit gives zero residual variance).
    """
    # Align on common non-NaN indexes. Index.intersection keeps a
    # deterministic order; the previous raw-set intersection was unordered
    # and is rejected as an indexer by recent pandas versions.
    common_index = col_true.dropna().index.intersection(col_prediction.dropna().index)
    col_true = col_true.loc[common_index]
    col_prediction = col_prediction.loc[common_index]
    # Calculate residuals and variance
    residuals = col_true.values - col_prediction.values
    rss = np.sum(residuals ** 2)
    sigma2 = np.var(residuals)
    # Number of parameters
    k = num_params
    # Number of observations
    n = len(col_true.values)
    # Gaussian log-likelihood of the residuals
    log_likelihood = -n / 2 * (np.log(2 * np.pi) + np.log(sigma2) + rss / (n * sigma2))
    # AIC = 2k - 2 ln(L)
    aic = 2 * k - 2 * log_likelihood
    return aic
def prophet_model_train(model, data_model_train, data_model_test, main_column, plotting=True, *args, **kwargs):
    """Fit a Prophet model on the train split and forecast over the test span.

    Parameters:
        model: Unfitted Prophet instance.
        data_model_train (pd.DataFrame): Prophet-style frame with 'ds'/'y'
            columns plus 'Date_Graph', used for fitting.
        data_model_test (pd.DataFrame): Same layout; its length sets the
            forecast horizon.
        main_column (str): Name used for the value columns in the output frame.
        plotting (bool): If True, show the forecast component plots.

    Returns:
        tuple: (fitted model, metrics DataFrame with actual/fitted/predicted
        columns and confidence intervals, raw Prophet forecast frame).
    """
    model.fit(data_model_train)

    # Extend the horizon by exactly the test-split length (monthly frequency).
    future = model.make_future_dataframe(periods=data_model_test.shape[0], freq='M')
    forecast = model.predict(future)

    if plotting:
        # Trend / seasonality component plots
        model.plot_components(forecast, figsize=(6, 6))
        plt.xticks(rotation=45)
        plt.show()

    # Assemble a frame of actuals aligned with fitted/predicted values.
    data_check_metrics = pd.concat([data_model_train, data_model_test])
    data_check_metrics['Date'] = data_check_metrics['ds']
    data_check_metrics[main_column] = data_check_metrics['y']
    data_check_metrics = data_check_metrics[['Date', 'Date_Graph', main_column]].reset_index()

    n_train = data_model_train.shape[0]
    n_test = data_model_test.shape[0]

    # In-sample fit covers the train rows; forecast fills the remainder.
    data_check_metrics[f"{main_column}_Fitted"] = pd.concat(
        [forecast.iloc[0:n_train]['yhat'], pd.Series([None] * n_test)]).values
    data_check_metrics[f"{main_column}_Prediction"] = pd.concat(
        [pd.Series([None] * n_train), forecast.iloc[n_train:data_check_metrics.shape[0]]['yhat']]).values
    # CI columns span the whole range (train rows included).
    data_check_metrics[f"{main_column}_Prediction_CI_low"] = forecast['yhat_lower'].values
    data_check_metrics[f"{main_column}_Prediction_CI_upp"] = forecast['yhat_upper'].values
    return model, data_check_metrics, forecast
def calculate_model_metrics_prophet(data_check_metrics, main_column):
    """Compute MAE/MSE/RMSE/MAPE for train and test plus AIC on train.

    Expects f"{main_column}_Fitted" (train fit) and f"{main_column}_Prediction"
    (test forecast) columns next to the actual values; NaNs outside each
    split are skipped automatically by the pandas-aware mean.

    Returns:
        pd.DataFrame: Metrics as rows, with 'TRAIN' and 'TEST' columns.
    """
    actual = data_check_metrics[main_column]

    def _split_errors(column_name):
        # Differences are NaN wherever the column is NaN, so each split's
        # rows are selected implicitly.
        diff = actual - data_check_metrics[column_name]
        mae = np.mean(np.abs(diff))
        mse = np.mean(diff ** 2)
        mape = np.mean(np.abs(diff / actual)) * 100
        return mae, mse, np.sqrt(mse), mape

    mae_train, mse_train, rmse_train, mape_train = _split_errors(f"{main_column}_Fitted")
    aic_train = calculate_aic(actual, data_check_metrics[f"{main_column}_Fitted"])
    mae_test, mse_test, rmse_test, mape_test = _split_errors(f"{main_column}_Prediction")

    # Build the result frame: rows = metrics, columns = TRAIN / TEST.
    rows = [
        {"Dataset": "TRAIN", "MAE": mae_train, "MSE": mse_train,
         "RMSE": rmse_train, "MAPE (%)": mape_train, "AIC": aic_train},
        {"Dataset": "TEST", "MAE": mae_test, "MSE": mse_test,
         "RMSE": rmse_test, "MAPE (%)": mape_test, "AIC": None},
    ]
    metrics_df = pd.DataFrame(rows)
    metrics_df.index = metrics_df['Dataset']
    metrics_df.drop(columns=['Dataset'], inplace=True)
    metrics_df = metrics_df.T

    # Human-readable summary
    print("*** TRAIN ***")
    print(f"MAE: {mae_train:.2f}")
    print(f"MSE: {mse_train:.2f}")
    print(f"RMSE: {rmse_train:.2f}")
    print(f"MAPE: {mape_train:.2f}%")
    print(f"AIC: {aic_train}")
    print("\n*** TEST ***")
    print(f"MAE: {mae_test:.2f}")
    print(f"MSE: {mse_test:.2f}")
    print(f"RMSE: {rmse_test:.2f}")
    print(f"MAPE: {mape_test:.2f}%")
    return metrics_df
def calcualate_model_performance_prophet(model, data_check_metrics, main_column):
    """Run residual diagnostics on a Prophet model's in-sample fit.

    Prints the Ljung-Box autocorrelation test and shows three figures:
    residuals over time, a residual histogram, and ACF/PACF plots.

    Parameters:
        model: Fitted Prophet model (kept for interface symmetry; not read).
        data_check_metrics (pd.DataFrame): Frame holding the actual series in
            `main_column` and the in-sample fit in f"{main_column}_Fitted".
        main_column (str): Name of the column with the observed values.

    Side effects:
        Adds a 'residuals' column to `data_check_metrics` in place and
        displays matplotlib figures.
    """
    from statsmodels.stats.diagnostic import acorr_ljungbox

    # The fitted column is populated only over the training span.
    n_train = data_check_metrics[f"{main_column}_Fitted"].dropna().shape[0]
    observed = data_check_metrics[main_column].iloc[:n_train]
    in_sample = data_check_metrics[f"{main_column}_Fitted"].iloc[:n_train]

    # Residuals = observed minus fitted, stored back on the frame.
    data_check_metrics['residuals'] = observed - in_sample
    residuals = data_check_metrics['residuals'].dropna()

    # Ljung-Box at lag 10: H0 = no remaining autocorrelation in residuals.
    ljung_box_results = acorr_ljungbox(residuals, lags=[10], return_df=True)
    print('\n')
    print('Ljung-Box Test')
    print(ljung_box_results)

    # Residuals over time — should hover around zero with no trend.
    plt.figure(figsize=(10, 6))
    plt.plot(residuals, label='Residuals', color='blue')
    plt.axhline(0, color='red', linestyle='--', linewidth=1)
    plt.title('Residuals Over Time')
    plt.xlabel('Time')
    plt.ylabel('Residuals')
    plt.legend()
    plt.show()

    # Residual distribution — roughly normal is desirable.
    plt.figure(figsize=(8, 5))
    plt.hist(residuals, bins=30, color='gray', edgecolor='black')
    plt.title('Histogram of Residuals')
    plt.xlabel('Residual Value')
    plt.ylabel('Frequency')
    plt.show()

    # ACF/PACF side by side — significant spikes indicate structure left over.
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plot_acf(residuals, lags=10, ax=plt.gca(), title="ACF of Residuals")
    plt.subplot(1, 2, 2)
    plot_pacf(residuals, lags=10, ax=plt.gca(), title="PACF of Residuals")
    plt.tight_layout()
    plt.show()
def plot_model_results_prophet(data, main_column, date_column='Date_Graph',
                               prediction_column=None, fitted_column=None,
                               ci_low_column=None, ci_upp_column=None,
                               cutoff_date=None, vertical_line_date=None,
                               vertical_line_ymin=None, vertical_line_ymax=None,
                               figsize=None):
    """
    Plot Prophet model results, including actual, fitted, and predicted values with confidence intervals.

    Draws a full-range plot and, when `cutoff_date` is given, a second
    zoomed plot of the rows after the cutoff.

    Parameters:
        data (pd.DataFrame): DataFrame containing the time series data.
        main_column (str): Column name for the actual values.
        date_column (str): Column name for the date. Default is 'Date_Graph'.
        prediction_column (str): Column name for predictions.
        fitted_column (str): Column name for fitted values.
        ci_low_column (str): Column name for lower confidence interval.
        ci_upp_column (str): Column name for upper confidence interval.
        cutoff_date (str): Date to filter the data for predictions. Format: 'YYYY-MM-DD'.
        vertical_line_date (str): Date for a vertical line. Format: 'YYYY-MM-DD'.
        vertical_line_ymin (float): Y-axis minimum for the vertical line.
        vertical_line_ymax (float): Y-axis maximum for the vertical line.
        figsize (tuple): Figure size; defaults to (10, 6).

    Returns:
        None: Displays the plot(s).
    """
    if figsize is None:
        figsize = (10, 6)
    # Work on a copy so the caller's DataFrame is not mutated by the
    # datetime conversion below (the original modified `data` in place).
    data = data.copy()
    # Ensure date_column is in datetime format
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')
    # Full-range plot: actuals plus optional fitted/predicted overlays
    plt.figure(figsize=figsize)
    plt.plot(data[date_column], data[main_column], label='Actual', color="orange", alpha=0.7)
    if fitted_column:
        plt.plot(data[date_column], data[fitted_column], label='Fitted', color='blue', linestyle="dashed", linewidth=3)
    if prediction_column:
        plt.plot(data[date_column], data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')
    if vertical_line_date:
        plt.vlines(x=pd.to_datetime(vertical_line_date), linestyles='--', color='orange', ymin=vertical_line_ymin, ymax=vertical_line_ymax)
    # Add titles and labels
    plt.title('Model - Actual vs Fitted and Forecast')
    plt.xlabel('Date')
    plt.ylabel('Load')
    plt.ylim(data[main_column].min() - 200, data[main_column].max() + 200)
    plt.legend()
    plt.grid()
    # Xticks
    xticks = data[date_column][::2]  # Select every second tick
    plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
    plt.show()
    if cutoff_date:
        # Zoomed plot on rows after the cutoff. Previously this line
        # hard-coded "2023-11-30" and silently ignored the cutoff_date
        # argument; now it honors the parameter like plot_model_results.
        # .copy() avoids SettingWithCopy issues on the assignments below.
        filtered_data = data.loc[data[date_column] > pd.to_datetime(cutoff_date)].copy()
        if prediction_column:
            # Bridge the gap between fitted and predicted at the cutoff.
            filtered_data[prediction_column] = filtered_data[prediction_column].fillna(filtered_data[fitted_column])
        # Plot the filtered data
        plt.figure(figsize=figsize)
        plt.plot(filtered_data[date_column], filtered_data[main_column], label='Actual', color="orange", alpha=0.7)
        if prediction_column:
            plt.plot(filtered_data[date_column], filtered_data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')
        if ci_low_column and ci_upp_column:
            # HACK: hard-coded splice of the 2023-12-01 fitted value into the
            # prediction column, and fillna(1500) for missing CI values —
            # presumably tuned to this specific dataset; revisit for other data.
            filtered_data.loc[filtered_data["Date"] == pd.to_datetime("2023-12-01"),
                              f"{main_column}_Prediction"] = data.loc[data["Date"] == pd.to_datetime("2023-12-01"), f"{main_column}_Fitted"]
            filtered_data = filtered_data.fillna(1500)
            plt.fill_between(filtered_data[date_column],
                             filtered_data[ci_low_column],
                             filtered_data[ci_upp_column],
                             color='pink', alpha=0.3, label='Confidence Interval')
        # Add titles and labels
        plt.title('Model - Actual vs Forecast (Filtered)')
        plt.xlabel('Date')
        plt.ylabel('Load')
        if ci_low_column and ci_upp_column:
            plt.ylim(filtered_data[ci_low_column].min() - 100, filtered_data[ci_upp_column].max() + 100)
        else:
            plt.ylim(filtered_data[main_column].min() - 200, filtered_data[main_column].max() + 200)
        plt.legend()
        plt.grid()
        # Xticks
        xticks = filtered_data[date_column][::2]
        plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
        plt.show()